1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data

data('gravier', package = 'datamicroarray')
table(gravier$y)
#> 
#> good poor 
#>  111   57

gravierset <- as.data.frame(cbind(class=1*(gravier$y=="poor"),gravier$x))

gravier <- NULL

1.2.0.1 Standarize the names for the reporting

studyName <- "GRAVIER"
dataframe <- gravierset
outcome <- "class"

TopVariables <- 10

1.2.1 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
168 2905
pander::pander(table(dataframe[,outcome]))
0 1
111 57

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

1.2.2 Scaling the data


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.9999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.3 The heatmap of the data



hm <- heatMaps(data=dataframe,
               Outcome=outcome,
               Scale=TRUE,
               hCluster = "row",
               xlab="Feature",
               ylab="Sample",
               cexCol=0.15,
               cexRow=0.25
               )

par(op)

1.3.0.1 Correlation Matrix of the Data

The heat map of the data


par(cex=0.6,cex.main=0.85,cex.axis=0.7)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0
gplots::heatmap.2(abs(cormat),
                  trace = "none",
#                  scale = "row",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Original Correlation",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")

1.4 The decorrelation

DEdataframe <- IDeA(dataframe,verbose=TRUE)
#> 
#>  Included: 2905 , Uni p: 0.005278412 , Uncorrelated Base: 2315 , Outcome-Driven Size: 0 , Base Size: 2315 
#> 
#> 
 1 <R=0.965,w=  1,N=    2>, Top: 1( 1 )[ 1 : 1 : 0.957 ]( 1 , 1 , 0 ),<|>Tot Used: 2 , Added: 1 , Zero Std: 0 , Max Cor: 0.956
#> 
 2 <R=0.956,w=  2,N=   35>, Top: 12( 3 )[ 1 : 12 : 0.928 ]( 12 , 20 , 1 ),<|>Tot Used: 33 , Added: 20 , Zero Std: 0 , Max Cor: 0.925
#> 
 3 <R=0.925,w=  2,N=   35>, Top: 16( 1 )[ 1 : 16 : 0.912 ]( 16 , 18 , 12 ),<|>Tot Used: 64 , Added: 18 , Zero Std: 0 , Max Cor: 0.912
#> 
 4 <R=0.912,w=  2,N=   35>, Top: 13( 1 )[ 1 : 13 : 0.906 ]( 13 , 14 , 25 ),<|>Tot Used: 85 , Added: 14 , Zero Std: 0 , Max Cor: 0.905
#> 
 5 <R=0.905,w=  2,N=   35>, Top: 7( 1 )[ 1 : 7 : 0.903 ]( 6 , 6 , 34 ),<|>Tot Used: 94 , Added: 6 , Zero Std: 0 , Max Cor: 0.902
#> 
 6 <R=0.902,w=  2,N=   35>, Top: 5( 1 )[ 1 : 5 : 0.901 ]( 5 , 5 , 38 ),<|>Tot Used: 102 , Added: 5 , Zero Std: 0 , Max Cor: 0.900
#> 
 7 <R=0.900,w=  2,N=   35>, Top: 2( 1 )[ 1 : 2 : 0.900 ]( 2 , 2 , 41 ),<|>Tot Used: 105 , Added: 2 , Zero Std: 0 , Max Cor: 0.900
#> 
 8 <R=0.900,w=  3,N=  269>, Top: 87( 5 )[ 1 : 87 : 0.850 ]( 87 , 148 , 43 ),<|>Tot Used: 312 , Added: 148 , Zero Std: 0 , Max Cor: 0.897
#> 
 9 <R=0.897,w=  3,N=  269>, Top: 15( 2 )[ 1 : 15 : 0.849 ]( 15 , 18 , 114 ),<|>Tot Used: 342 , Added: 18 , Zero Std: 0 , Max Cor: 0.848
#> 
 10 <R=0.848,w=  4,N=  484>, Top: 166( 1 ).[ 1 : 166 : 0.800 ]( 159 , 229 , 126 ),<|>Tot Used: 666 , Added: 229 , Zero Std: 0 , Max Cor: 0.839
#> 
 11 <R=0.839,w=  4,N=  484>, Top: 14( 1 )[ 1 : 14 : 0.800 ]( 14 , 22 , 247 ),<|>Tot Used: 695 , Added: 22 , Zero Std: 0 , Max Cor: 0.804
#> 
 12 <R=0.804,w=  5,N=    2>, Top: 1( 1 )[ 1 : 1 : 0.800 ]( 1 , 1 , 258 ),<|>Tot Used: 697 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#> 
 13 <R=0.000,w=  6,N=    0>
#> 
 [ 13 ], 0.7978739 Decor Dimension: 697 . Cor to Base: 407 , ABase: 178 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

3891

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

3463

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

5

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

4.8

1.4.1 The decorrelation matrix


par(cex=0.6,cex.main=0.85,cex.axis=0.7)

UPSTM <- attr(DEdataframe,"UPSTM")

gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Decorrelation matrix",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="|Beta|>0",
                  xlab="Output Feature", ylab="Input Feature")


par(op)

1.5 The heatmap of the decorrelated data


hm <- heatMaps(data=DEdataframe,
               Outcome=outcome,
               Scale=TRUE,
               hCluster = "row",
               cexRow = 0.15,
               cexCol = 0.15,
               xlab="Feature",
               ylab="Sample")

par(op)

1.6 The correlation matrix after decorrelation


cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Correlation after IDeA",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")


par(op)

print(max(abs(cormat)))

[1] 0.7999088

1.7 U-MAP Visualization of features

1.7.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

1.7.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

1.8 Univariate Analysis

1.8.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : g1A07 200 : g3H03 300 : g1E07 400 : g4F03 500 : g4C05
600 : g1int292 700 : g1int356 800 : g7A03 900 : g4D11 1000 : g1int577
1100 : g1CNS420 1200 : g7G07 1300 : g1int785 1400 : g1CNS59 1500 : g1CNS178
1600 : g1int949 1700 : g1int1028 1800 : g1int1089 1900 : g11D05 2000 : g1int1222
2100 : g1int1298 2200 : g1int1376 2300 : g1int1449 2400 : g10E08 2500 : g1CNS90
2600 : g7F11 2700 : g1int1693 2800 : g1CNS93 2900 : g1int1800




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : g1A07 200 : g3H03 300 : La_g1E07 400 : g4F03 500 : g4C05
600 : g1int292 700 : g1int356 800 : g7A03 900 : g4D11 1000 : g1int577
1100 : g1CNS420 1200 : La_g7G07 1300 : g1int785 1400 : La_g1CNS59 1500 : g1CNS178
1600 : g1int949 1700 : g1int1028 1800 : La_g1int1089 1900 : g11D05 2000 : g1int1222
2100 : g1int1298 2200 : g1int1376 2300 : La_g1int1449 2400 : g10E08 2500 : g1CNS90
2600 : g7F11 2700 : g1int1693 2800 : La_g1CNS93 2900 : g1int1800

1.8.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
g1CNS507 -0.812 0.941 0.1840 0.855 0.11198 0.796
g1CNS105 0.832 1.160 -0.0863 0.836 0.01027 0.749
g1CNS382 -0.745 1.076 0.1514 0.940 0.12716 0.745
g1int804 -0.656 0.949 0.1149 0.889 0.02827 0.743
g1CNS91 0.887 1.174 -0.0348 0.695 0.00370 0.742
g1CNS26 -0.722 1.108 0.1852 0.855 0.93893 0.738
g1int340 -0.957 1.291 0.1553 1.165 0.08048 0.737
g1CNS70 0.898 1.293 -0.0759 0.978 0.06554 0.731
g1CNS158 0.846 1.168 0.0216 0.890 0.00114 0.731
g1CNS28 0.838 1.180 -0.0334 0.916 0.00650 0.726



finalTable <- univarDe$orderframe[topvar,univariate_columns]
pander::pander(univarDe$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
g1CNS507 -0.812 0.941 0.1840 0.855 0.11198 0.796
g1CNS105 0.832 1.160 -0.0863 0.836 0.01027 0.749
g1int340 -0.957 1.291 0.1553 1.165 0.08048 0.737
g1CNS158 0.846 1.168 0.0216 0.890 0.00114 0.731
g8D02 -0.776 1.372 0.2052 1.194 0.14136 0.725
g1int1671 0.614 1.133 -0.1787 1.055 0.20931 0.719
g9E01 0.876 1.334 -0.0751 1.067 0.08676 0.717
g1int812 -0.571 1.013 0.1304 0.840 0.27965 0.716
g1CNS74 0.645 1.185 -0.1661 1.093 0.41448 0.716
g8F04 0.511 0.943 -0.2328 0.953 0.80241 0.710

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2 484 0.167


dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")


pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
g1CNS507 -0.812 0.941 0.1840 0.855 0.11198 0.796 0.796 1
g1CNS105 0.832 1.160 -0.0863 0.836 0.01027 0.749 0.749 3
g1int340 -0.957 1.291 0.1553 1.165 0.08048 0.737 0.737 NA
g1CNS158 0.846 1.168 0.0216 0.890 0.00114 0.731 0.731 11
g8D02 -0.776 1.372 0.2052 1.194 0.14136 0.725 0.725 NA
g1int1671 0.614 1.133 -0.1787 1.055 0.20931 0.719 0.719 NA
g9E01 0.876 1.334 -0.0751 1.067 0.08676 0.717 0.717 NA
g1int812 -0.571 1.013 0.1304 0.840 0.27965 0.716 0.716 11
g1CNS74 0.645 1.185 -0.1661 1.093 0.41448 0.716 0.716 NA
g8F04 0.511 0.943 -0.2328 0.953 0.80241 0.710 0.710 NA